

########## IMPUTATION OF INCOME / EDUCATION PERCENTILE COMBINATIONS ##########


######################
## PACKAGES
######################
library(dplyr)
library(haven)
library(Hmisc)
library(MASS)
library(modi)


#######################################################
## STEP 1: Import covariance over time
#######################################################

covariances <- read_excel("NES_covariances_Norway.xlsx")

#######################################################
## STEP 2: Make variance/covariance matrix 
#######################################################
d <- read_excel("Main dataset for Norway.xlsx")

######## MIDDLE STOP ##########
d$pcFAV_SEX_MEN <- d$MALE_FAV/(d$MALE_FAV+d$MALE_OPP)
d$pcFAV_SEX_WOMEN <- d$FEMALE_FAV/(d$FEMALE_FAV+d$FEMALE_OPP)
d$S_SEX_MALE <- d$MALE_FAV+d$MALE_OPP
d$S_SEX_FEMALE <- d$FEMALE_FAV+d$FEMALE_OPP

for(i in 1:nrow(d)){
  drow <- d[i,]
  # INCOME/SUPPORT COVARIANCE
  d1 <- drow %>% dplyr::select(starts_with("S_HINC"))
  d1 <- as.data.frame(t(d1))
  d1$prop <- d1$V1/sum(d1$V1, na.rm=T)
  d1$cumu <- cumsum(d1$prop)
  d1 <- mutate(d1, score = ((cumu - lag(cumu))/2)+lag(cumu)) # Make percentile midpoint scores
  d1$score[1] <- d1$cumu[1]-(d1$cumu[1]/2) # Make percentile midpoint score for first category since formula above returns NA for that one
  d1$income <- d1$Var1
  d1 <- add_rownames(d1, "income")
  d1$income <- row.names(d1)
  names(d1)[names(d1)=="V1"] <- "n"
  d1 <- dplyr::select(d1, income, score, n)
  d2 <- drow %>% dplyr::select(starts_with("pcFAV_INC"))
  d2 <- as.data.frame(t(d2)) 
  d2$income <- 1:nrow(d2) 
  names(d2)[names(d2)=="V1"] <- "sup"
  d3 <- merge(d1,d2, by="income")
  d3<-na.omit(d3)
  d4<- d3 %>% dplyr::select(score,sup)
  sup_inc_cov<-cov.wt(d4,wt=d3$n)$cov[1,2]
  inc_var<-cov.wt(d4,wt=d3$n)$cov[1,1]
  inc_mean <- weighted.mean(d3$score,w=d3$n)
  # SEX/SUPPORT COVARIANCE
  d1 <- drow %>% dplyr::select(starts_with("S_SEX"))
  d1 <- as.data.frame(t(d1))
  d1$sex <- rownames(d1)
  d1$sex <- ifelse(d1$sex=="S_SEX_MALE",1,0)
  names(d1)[names(d1)=="V1"] <- "n"
  d1 <- dplyr::select(d1, sex, n)
  d2 <- drow %>% dplyr::select(starts_with("pcFAV_SEX"))
  d2 <- as.data.frame(t(d2)) 
  d2$sex <- rownames(d2)
  d2$sex <- ifelse(d2$sex=="pcFAV_SEX_MEN",1,0)
  names(d2)[names(d2)=="V1"] <- "sup"
  d3 <- merge(d1,d2, by="sex")
  cor(d3$sup,d3$sex,use="complete.obs")
  d3<-na.omit(d3)
  d3$prop <- d3$n/sum(d3$n, na.rm=T)
  d4<- d3 %>% dplyr::select(sex,sup)
  sup_sex_cov<-cov.wt(d4,wt=d3$n,method="ML")$cov[1,2]
  sex_var<-cov.wt(d4,wt=d3$n,method="ML")$cov[1,1] # NB: Variance for "sup" NOT CORRECT (Calculated with correct weights below)
  sex_mean <- weighted.mean(d3$sex,w=d3$n)
  # SUPPORT VARIANCE
  d1 <- drow %>% dplyr::select(OVERALL_FAV,OVERALL_OPP)
  d1 <- as.data.frame(t(d1))
  d1$sup<-c(1,0)
  sup_var <- wtd.var(d1$sup,d1$V1) 
  sup_mean <- weighted.mean(d1$sup,w=d1$V1)
  # BUILD VARIANCE COVARIANCE MATRIX
  sup <- c(sup_var,sup_inc_cov,sup_sex_cov)
  inc <- c(sup_inc_cov,inc_var,covariances$cov[covariances$year==drow$ASKED_YEAR])
  sex <- c(sup_sex_cov,covariances$cov[covariances$year==drow$ASKED_YEAR],sex_var)
  varcovar <- data.frame(sup,inc,sex)
  rownames(varcovar) <- c("sup","inc","sex")
  varcovar <- data.matrix(varcovar)
  varcovar # VARIANCE COVARIANCE MATRIX
  means <- c(sup_mean,inc_mean,sex_mean)
  data<-as.data.frame(mvrnorm(n=drow$ORG_DATASET_N,means,varcovar,empirical=T)) # Generate dataset based on this var-covar-matrix
  m1 <- glm(sup ~ inc + I(inc^2) + sex, data=data) # Income, income^2, and sex, as IVs,
  d4 <- expand.grid(inc=c(0.90,0.50,0.10),sex=c(1,0))
  d4$pred <- predict(m1, d4, type="response")
  d$pred_i90_men[i] <- d4$pred[1]
  d$pred_i50_men[i] <- d4$pred[2]
  d$pred_i10_men[i] <- d4$pred[3]
  d$pred_i90_women[i] <- d4$pred[4]
  d$pred_i50_women[i] <- d4$pred[5]
  d$pred_i10_women[i] <- d4$pred[6]
}

############################################################################
############################## END #########################################
############################################################################

write.xlsx(d, "PhD Master dataset 09.07.2021 gender and household.xlsx") # Writes out master dataset with the new variables added





